<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>ArXiv CS.CV Papers (Image/Video Generation) - April 29, 2025</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/framer-motion/10.16.4/framer-motion.dev.js"></script>
    <!-- Example using Font Awesome (replace with your preferred icon library if needed) -->
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/all.min.css">
    <style>
        /* Page stylesheet: light "bento" card layout with teal accents.
           Tailwind utility names in comments are equivalents for reference only. */
        @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');

        :root {
            /* Palette: light, clean, with teal/aqua accents */
            --bg-color: #f8fafc; /* Tailwind slate-50 (very light gray) */
            --card-bg-color: #ffffff; /* White */
            --text-color: #1e293b; /* Tailwind slate-800 (dark gray-blue) */
            --text-muted-color: #64748b; /* Tailwind slate-500 (medium gray-blue) */
            --header-color: #0f172a; /* Tailwind slate-900 (very dark blue) */
            --highlight-primary: #14b8a6; /* Tailwind teal-500 */
            --highlight-secondary: #67e8f9; /* Tailwind cyan-300 */
            --border-color: #e2e8f0; /* Tailwind slate-200 (light gray) */
            --shadow-color: rgba(15, 23, 42, 0.08); /* Subtle shadow based on slate-900 */
        }

        body {
            background-color: var(--bg-color);
            color: var(--text-color);
            font-family: 'Inter', sans-serif;
            overflow-x: hidden; /* Prevent horizontal scroll */
            line-height: 1.6;
        }

        /* Card container: a single-column grid, one card per row */
        .bento-grid {
            display: grid;
            gap: 1.5rem; /* Tailwind gap-6 */
            grid-template-columns: 1fr; /* Force single column */
            padding-bottom: 4rem; /* Breathing room below the last card */
        }

        /* Individual paper card: translucent white panel with background blur */
        .bento-item {
            background-color: rgba(255, 255, 255, 0.7); /* White at 70% opacity */
            backdrop-filter: blur(10px);
            -webkit-backdrop-filter: blur(10px); /* Safari prefix */
            border-radius: 1rem;
            padding: 1.75rem;
            border: 1px solid rgba(226, 232, 240, 0.5); /* slate-200 at 50% opacity */
            box-shadow: 0 4px 12px var(--shadow-color);
            transition: transform 0.3s ease-out, box-shadow 0.3s ease-out, background-color 0.3s ease-out;
            overflow: hidden; /* Ensure content doesn't overflow */
            position: relative; /* For potential pseudo-elements */
        }

        /* Lift the card slightly on hover */
        .bento-item:hover {
            transform: translateY(-6px);
            box-shadow: 0 10px 20px var(--shadow-color), 0 4px 8px rgba(15, 23, 42, 0.06);
        }

        .paper-title {
            font-size: 1.125rem; /* Tailwind text-lg */
            font-weight: 600; /* Tailwind font-semibold */
            color: var(--highlight-primary);
            margin-bottom: 0.75rem; /* Tailwind mb-3 */
            line-height: 1.4;
        }

        .paper-summary {
            font-size: 0.875rem; /* Tailwind text-sm */
            color: var(--text-muted-color);
            margin-bottom: 1.25rem; /* Tailwind mb-5 */
            line-height: 1.6;
        }

        /* Pill-style link to the paper */
        .paper-link {
            display: inline-flex; /* Flex keeps the icon vertically aligned with the label */
            align-items: center;
            font-size: 0.875rem; /* Tailwind text-sm */
            font-weight: 600;
            color: var(--highlight-primary);
            text-decoration: none;
            padding: 0.5rem 1rem;
            border-radius: 0.5rem;
            background-color: rgba(20, 184, 166, 0.08); /* Subtle teal background */
            border: 1px solid rgba(20, 184, 166, 0.2);
            transition: background-color 0.3s ease, color 0.3s ease, transform 0.2s ease;
        }

        .paper-link i {
            margin-right: 0.5rem; /* Tailwind mr-2 */
            transition: transform 0.3s ease;
        }

        /* Keyboard focus gets the same affordance as pointer hover */
        .paper-link:hover,
        .paper-link:focus-visible {
            background-color: rgba(20, 184, 166, 0.15);
            color: #0d9488; /* Darker teal */
            transform: translateY(-1px);
        }

        .paper-link:hover i,
        .paper-link:focus-visible i {
            transform: translateX(2px);
        }

        .paper-authors {
            font-size: 0.75rem; /* Tailwind text-xs */
            color: var(--text-muted-color);
            margin-top: 1rem; /* Tailwind mt-4 */
            font-style: italic;
        }

        .header {
            /* The masthead is authored as a non-standard motion.div element, which
               browsers render inline by default; block display is required for the
               vertical margin and padding below to affect layout. */
            display: block;
            text-align: center;
            margin-bottom: 3rem; /* Tailwind mb-12 */
            padding-top: 3rem; /* Tailwind pt-12 */
        }

        .header h1 {
            font-size: 2.5rem; /* Between Tailwind text-4xl (2.25rem) and text-5xl (3rem) */
            font-weight: 700; /* Tailwind font-bold */
            color: var(--header-color);
            letter-spacing: -0.025em; /* Tailwind tracking-tight */
            margin-bottom: 0.5rem;
            /* Optional: Add a subtle text gradient */
            /* background: linear-gradient(90deg, var(--highlight-primary), var(--highlight-secondary)); */
            /* -webkit-background-clip: text; */
            /* -webkit-text-fill-color: transparent; */
        }

        .header p {
            font-size: 1.125rem; /* Tailwind text-lg */
            color: var(--text-muted-color);
            margin-top: 0.5rem; /* Tailwind mt-2 */
            max-width: 600px;
            margin-left: auto;
            margin-right: auto;
        }

        .footer {
            text-align: center;
            color: var(--text-muted-color);
            font-size: 0.875rem; /* Tailwind text-sm */
            padding-top: 2rem;
            padding-bottom: 2rem; /* Tailwind py-8 */
            border-top: 1px solid var(--border-color);
            margin-top: 4rem;
        }

        /* Decorative hairline that fades out towards both ends */
        .line-graphic {
            height: 1px;
            background: linear-gradient(90deg, rgba(20, 184, 166, 0), var(--highlight-primary), rgba(20, 184, 166, 0));
            opacity: 0.6;
            margin: 1.5rem 0;
        }

        /* Hook for elements carrying data-motion-element; intentionally declares nothing */
        [data-motion-element] {
        }

        .paper-tldr {
            font-size: 0.95rem; /* Slightly bigger than the summary */
            color: #475569; /* Tailwind slate-600 (slightly darker than the summary) */
            margin-top: 0.75rem; /* Tailwind mt-3 */
            margin-bottom: 0.75rem; /* Tailwind mb-3 */
            /* font-style: italic; */
            font-weight: bold;
        }

        .paper-rating {
            margin-top: 1rem; /* Tailwind mt-4 */
            margin-bottom: 1rem; /* Tailwind mb-4 */
            color: #f59e0b; /* Tailwind amber-500 */
        }

        .paper-rating i {
            margin-right: 0.125rem; /* Tailwind mr-0.5 */
        }

        /* Sub-rating stars use the same colour and spacing as the overall rating */
        .paper-sub-ratings .rating-item i {
            color: #f59e0b; /* Tailwind amber-500 */
            margin-right: 0.125rem;
        }

        /* Respect the user's reduced-motion preference: no hover movement or easing */
        @media (prefers-reduced-motion: reduce) {
            .bento-item,
            .paper-link,
            .paper-link i {
                transition: none;
            }

            .bento-item:hover,
            .paper-link:hover,
            .paper-link:focus-visible,
            .paper-link:hover i,
            .paper-link:focus-visible i {
                transform: none;
            }
        }
    </style>
</head>
<body class="container mx-auto px-4 antialiased">

    <!-- Page masthead: title, tagline and issue date.
         NOTE(review): initial/animate/transition follow Framer Motion's React prop
         names; nothing visible in this file turns them into animations on a
         plain-HTML motion.div element. Confirm a script elsewhere consumes them. -->
    <motion.div
        class="header"
        data-motion-element
        initial="{ opacity: 0, y: -30 }"
        animate="{ opacity: 1, y: 0 }"
        transition="{ duration: 0.6, ease: 'easeOut' }"
    >
        <h1>AIGC Daily Papers</h1>
        <p>Daily papers related to Image/Video/Multimodal Generation from cs.CV</p>
        <!-- Machine-readable issue date -->
        <p><time datetime="2025-04-29">April 29, 2025</time></p>
        <!-- Decorative divider below the masthead text -->
        <div class="line-graphic mt-4 mb-8 mx-auto w-1/4"></div>
    </motion.div>

    <div class="bento-grid" id="paper-grid">
        
        <!-- Paper card (arXiv 2504.19918). The viewport amount of 0.2 is meant to
             trigger the whileInView state once 20% of the card is visible. That note
             used to sit inside the start tag, where HTML parses it as attributes. -->
        <motion.div
            class="bento-item"
            data-motion-element
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.0, ease: 'easeOut' }"
        >
            <h2 class="paper-title">Enhancing Surgical Documentation through Multimodal Visual-Temporal Transformers and Generative AI</h2>
            <!-- Abstract; LaTeX residue (stray "%" comment markers, escaped "\%") removed -->
            <p class="paper-summary">The automatic summarization of surgical videos is essential for enhancing
procedural documentation, supporting surgical training, and facilitating
post-operative analysis. This paper presents a novel method at the intersection
of artificial intelligence and medicine, aiming to develop machine learning
models with direct real-world applications in surgical contexts. We propose a
multi-modal framework that leverages recent advancements in computer vision and
large language models to generate comprehensive video summaries. The approach
is structured in three key stages. First, surgical videos are divided into
clips, and visual features are extracted at the frame level using visual
transformers. This step focuses on detecting tools, tissues, organs, and
surgical actions. Second, the extracted features are transformed into
frame-level captions via large language models. These are then combined with
temporal features, captured using a ViViT-based encoder, to produce clip-level
summaries that reflect the broader context of each video segment. Finally, the
clip-level descriptions are aggregated into a full surgical report using a
dedicated LLM tailored for the summarization task. We evaluate our method on
the CholecT50 dataset, using instrument and action annotations from 50
laparoscopic videos. The results show strong performance, achieving 96%
precision in tool detection and a BERT score of 0.74 for temporal context
summarization. This work contributes to the advancement of AI-assisted tools
for surgical reporting, offering a step toward more intelligent and reliable
clinical documentation.</p>

            <!-- One-line summaries: English first, then Simplified Chinese -->
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces a multimodal ai framework for generating surgical video summaries using visual transformers, vivit, and llms, demonstrating strong performance on the cholect50 dataset for tool detection and temporal context summarization.</p>

            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 本文介绍了一个多模态ai框架，利用视觉transformer、vivit和llm生成手术视频摘要，并在cholect50数据集上展示了在工具检测和时间上下文摘要方面的强大性能。</p>

            <!-- Per-criterion ratings. Star icons are decorative; the "(n/10)" text carries the value. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>

                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>

                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>

                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
            </div>

            <!-- Overall rating; icons stay on separate lines to keep the inter-star whitespace -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the new page from reaching window.opener -->
            <a class="paper-link" href="https://arxiv.org/abs/2504.19918v1" target="_blank" rel="noopener noreferrer">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Hugo Georgenthum, Cristian Cosentino, Fabrizio Marozzo, Pietro Liò</p>

        </motion.div>
        
        <!-- Paper card (arXiv 2504.19894). The viewport amount of 0.2 is meant to
             trigger the whileInView state once 20% of the card is visible. That note
             used to sit inside the start tag, where HTML parses it as attributes. -->
        <motion.div
            class="bento-item"
            data-motion-element
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.05, ease: 'easeOut' }"
        >
            <h2 class="paper-title">CineVerse: Consistent Keyframe Synthesis for Cinematic Scene Composition</h2>
            <!-- Abstract -->
            <p class="paper-summary">We present CineVerse, a novel framework for the task of cinematic scene
composition. Similar to traditional multi-shot generation, our task emphasizes
the need for consistency and continuity across frames. However, our task also
focuses on addressing challenges inherent to filmmaking, such as multiple
characters, complex interactions, and visual cinematic effects. In order to
learn to generate such content, we first create the CineVerse dataset. We use
this dataset to train our proposed two-stage approach. First, we prompt a large
language model (LLM) with task-specific instructions to take in a high-level
scene description and generate a detailed plan for the overall setting and
characters, as well as the individual shots. Then, we fine-tune a text-to-image
generation model to synthesize high-quality visual keyframes. Experimental
results demonstrate that CineVerse yields promising improvements in generating
visually coherent and contextually rich movie scenes, paving the way for
further exploration in cinematic video synthesis.</p>

            <!-- One-line summaries: English first, then Simplified Chinese -->
            <p class="paper-tldr"><strong>TLDR</strong>: cineverse introduces a framework and dataset for generating consistent cinematic scene keyframes using llms for scene planning and fine-tuned text-to-image models for synthesis, aiming to address the challenges of multi-character, complex interaction cinematic content creation.</p>

            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: cineverse 提出了一个框架和数据集，利用llm进行场景规划，并微调文本到图像模型进行合成，从而生成一致的电影场景关键帧，旨在解决多角色、复杂互动电影内容创作的挑战。</p>

            <!-- Per-criterion ratings. Star icons are decorative; the "(n/10)" text carries the value. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>

                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>

                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>

                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
            </div>

            <!-- Overall rating; icons stay on separate lines to keep the inter-star whitespace -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the new page from reaching window.opener -->
            <a class="paper-link" href="https://arxiv.org/abs/2504.19894v1" target="_blank" rel="noopener noreferrer">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Quynh Phung, Long Mai, Fabian David Caba Heilbron, Feng Liu, Jia-Bin Huang, Cusuh Ham</p>

        </motion.div>
        
        <!-- Paper card (arXiv 2504.19834). The viewport amount of 0.2 is meant to
             trigger the whileInView state once 20% of the card is visible. That note
             used to sit inside the start tag, where HTML parses it as attributes. -->
        <motion.div
            class="bento-item"
            data-motion-element
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.1, ease: 'easeOut' }"
        >
            <h2 class="paper-title">AnimateAnywhere: Rouse the Background in Human Image Animation</h2>
            <!-- Abstract -->
            <p class="paper-summary">Human image animation aims to generate human videos of given characters and
backgrounds that adhere to the desired pose sequence. However, existing methods
focus more on human actions while neglecting the generation of background,
which typically leads to static results or inharmonious movements. The
community has explored camera pose-guided animation tasks, yet preparing the
camera trajectory is impractical for most entertainment applications and
ordinary users. As a remedy, we present an AnimateAnywhere framework, rousing
the background in human image animation without requirements on camera
trajectories. In particular, based on our key insight that the movement of the
human body often reflects the motion of the background, we introduce a
background motion learner (BML) to learn background motions from human pose
sequences. To encourage the model to learn more accurate cross-frame
correspondences, we further deploy an epipolar constraint on the 3D attention
map. Specifically, the mask used to suppress geometrically unreasonable
attention is carefully constructed by combining an epipolar mask and the
current 3D attention map. Extensive experiments demonstrate that our
AnimateAnywhere effectively learns the background motion from human pose
sequences, achieving state-of-the-art performance in generating human animation
results with vivid and realistic backgrounds. The source code and model will be
available at https://github.com/liuxiaoyu1104/AnimateAnywhere.</p>

            <!-- One-line summaries: English first, then Simplified Chinese -->
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces animateanywhere, a framework for human image animation that focuses on generating realistic background motion based on human pose sequences, using a background motion learner and epipolar constraints on attention maps.</p>

            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 本文介绍了animateanywhere，一个基于人体姿势序列生成逼真背景运动的人体图像动画框架，该框架使用背景运动学习器和注意力图上的对极约束。</p>

            <!-- Per-criterion ratings. Star icons are decorative; the "(n/10)" text carries the value. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>

                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>

                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>

                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>

            <!-- Overall rating; icons stay on separate lines to keep the inter-star whitespace -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the new page from reaching window.opener -->
            <a class="paper-link" href="https://arxiv.org/abs/2504.19834v1" target="_blank" rel="noopener noreferrer">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Xiaoyu Liu, Mingshuai Yao, Yabo Zhang, Xianhui Lin, Peiran Ren, Xiaoming Li, Ming Liu, Wangmeng Zuo</p>

        </motion.div>
        
        <!-- Paper card (arXiv 2504.19724). The viewport amount of 0.2 is meant to
             trigger the whileInView state once 20% of the card is visible. That note
             used to sit inside the start tag, where HTML parses it as attributes.
             The stagger delay is written as 0.15 (was 0.15000000000000002). -->
        <motion.div
            class="bento-item"
            data-motion-element
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.15, ease: 'easeOut' }"
        >
            <h2 class="paper-title">RepText: Rendering Visual Text via Replicating</h2>
            <!-- Abstract -->
            <p class="paper-summary">Although contemporary text-to-image generation models have achieved
remarkable breakthroughs in producing visually appealing images, their capacity
to generate precise and flexible typographic elements, especially non-Latin
alphabets, remains constrained. To address these limitations, we start from an
naive assumption that text understanding is only a sufficient condition for
text rendering, but not a necessary condition. Based on this, we present
RepText, which aims to empower pre-trained monolingual text-to-image generation
models with the ability to accurately render, or more precisely, replicate,
multilingual visual text in user-specified fonts, without the need to really
understand them. Specifically, we adopt the setting from ControlNet and
additionally integrate language agnostic glyph and position of rendered text to
enable generating harmonized visual text, allowing users to customize text
content, font and position on their needs. To improve accuracy, a text
perceptual loss is employed along with the diffusion loss. Furthermore, to
stabilize rendering process, at the inference phase, we directly initialize
with noisy glyph latent instead of random initialization, and adopt region
masks to restrict the feature injection to only the text region to avoid
distortion of the background. We conducted extensive experiments to verify the
effectiveness of our RepText relative to existing works, our approach
outperforms existing open-source methods and achieves comparable results to
native multi-language closed-source models. To be more fair, we also
exhaustively discuss its limitations in the end.</p>

            <!-- One-line summaries: English first, then Simplified Chinese -->
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces reptext, a method for improving multilingual text rendering in text-to-image generation models by replicating glyphs and positions, achieving comparable results to closed-source models.</p>

            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文介绍了reptext，一种通过复制字形和位置来改进文本到图像生成模型中的多语言文本渲染的方法，其结果与闭源模型相当。</p>

            <!-- Per-criterion ratings. Star icons are decorative; the "(n/10)" text carries the value. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>

                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>

                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>

                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>

            <!-- Overall rating; icons stay on separate lines to keep the inter-star whitespace -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the new page from reaching window.opener -->
            <a class="paper-link" href="https://arxiv.org/abs/2504.19724v1" target="_blank" rel="noopener noreferrer">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Haofan Wang, Yujia Xu, Yimeng Li, Junchen Li, Chaowei Zhang, Jing Wang, Kejia Yang, Zhibo Chen</p>

        </motion.div>
        
        <!-- Paper card: DiVE. Entrance animation settings live in the attributes below;
             viewport amount 0.2 means the animation triggers when 20% of the card is visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.2, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">DiVE: Efficient Multi-View Driving Scenes Generation Based on Video Diffusion Transformer</h2>
            <p class="paper-summary">Collecting multi-view driving scenario videos to enhance the performance of
3D visual perception tasks presents significant challenges and incurs
substantial costs, making generative models for realistic data an appealing
alternative. Yet, the videos generated by recent works suffer from poor quality
and spatiotemporal consistency, undermining their utility in advancing
perception tasks under driving scenarios. To address this gap, we propose DiVE,
a diffusion transformer-based generative framework meticulously engineered to
produce high-fidelity, temporally coherent, and cross-view consistent
multi-view videos, aligning seamlessly with bird's-eye view layouts and textual
descriptions. DiVE leverages a unified cross-attention and a SketchFormer to
exert precise control over multimodal data, while incorporating a view-inflated
attention mechanism that adds no extra parameters, thereby guaranteeing
consistency across views. Despite these advancements, synthesizing
high-resolution videos under multimodal constraints introduces dual challenges:
investigating the optimal classifier-free guidance configuration under intricate
multi-condition inputs and mitigating excessive computational latency in
high-resolution rendering--both of which remain underexplored in prior
researches. To resolve these limitations, we introduce two innovations:
Multi-Control Auxiliary Branch Distillation, which streamlines multi-condition
CFG selection while circumventing high computational overhead, and Resolution
Progressive Sampling, a training-free acceleration strategy that staggers
resolution scaling to reduce high latency due to high resolution. These
innovations collectively achieve a 2.62x speedup with minimal quality
degradation. Evaluated on the nuScenes dataset, DiVE achieves SOTA performance
in multi-view video generation, yielding photorealistic outputs with
exceptional temporal and cross-view coherence.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: DiVE is a diffusion transformer-based framework for generating high-fidelity, temporally and cross-view consistent multi-view driving scene videos, featuring innovations for improved efficiency and quality.</p>

            <!-- Chinese summary: lang is set so assistive technology switches pronunciation on this English page. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: DiVE是一个基于扩散transformer的框架，用于生成高保真、时序和跨视角一致的多视角驾驶场景视频，其创新之处在于提高了效率和质量。</p>

            <!-- Star glyphs are decorative (aria-hidden); the "(n/10)" text carries the score. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the target page from reaching window.opener. -->
            <a href="https://arxiv.org/abs/2504.19614v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Junpeng Jiang, Gangyi Hong, Miao Zhang, Hengtong Hu, Kun Zhan, Rui Shao, Liqiang Nie</p>

        </motion.div>
        
        <!-- Paper card: Heat Diffusion Models. Entrance animation settings live in the attributes below;
             viewport amount 0.2 means the animation triggers when 20% of the card is visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.25, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Image Generation Method Based on Heat Diffusion Models</h2>
            <p class="paper-summary">Denoising Diffusion Probabilistic Models (DDPMs) achieve high-quality image
generation without adversarial training, but they process images as a whole.
Since adjacent pixels are highly likely to belong to the same object, we
propose the Heat Diffusion Model (HDM) to further preserve image details and
generate more realistic images. HDM is a model that incorporates pixel-level
operations while maintaining the same training process as DDPM. In HDM, the
discrete form of the two-dimensional heat equation is integrated into the
diffusion and generation formulas of DDPM, enabling the model to compute
relationships between neighboring pixels during image processing. Our
experiments demonstrate that HDM can generate higher-quality samples compared
to models such as DDPM, Consistency Diffusion Models (CDM), Latent Diffusion
Models (LDM), and Vector Quantized Generative Adversarial Networks (VQGAN).</p>

            <p class="paper-tldr"><strong>TLDR</strong>: The paper introduces Heat Diffusion Models (HDM), a novel approach building upon DDPMs by incorporating pixel-level relationships derived from the heat equation to enhance image detail and realism in generated samples.</p>

            <!-- Chinese summary: lang is set so assistive technology switches pronunciation on this English page. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文介绍了一种热扩散模型（HDM），它在DDPM的基础上，通过结合从热方程导出的像素级关系来增强生成样本中的图像细节和真实感。</p>

            <!-- Star glyphs are decorative (aria-hidden); the "(n/10)" text carries the score. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the target page from reaching window.opener. -->
            <a href="https://arxiv.org/abs/2504.19600v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Pengfei Zhang, Shouqing Jia</p>

        </motion.div>
        
        <!-- Paper card: SynergyAmodal. Entrance animation settings live in the attributes below;
             viewport amount 0.2 means the animation triggers when 20% of the card is visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.3, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">SynergyAmodal: Deocclude Anything with Text Control</h2>
            <p class="paper-summary">Image deocclusion (or amodal completion) aims to recover the invisible
regions (i.e., shape and appearance) of occluded instances in images. Despite
recent advances, the scarcity of high-quality data that balances diversity,
plausibility, and fidelity remains a major obstacle. To address this challenge,
we identify three critical elements: leveraging in-the-wild image data for
diversity, incorporating human expertise for plausibility, and utilizing
generative priors for fidelity. We propose SynergyAmodal, a novel framework for
co-synthesizing in-the-wild amodal datasets with comprehensive shape and
appearance annotations, which integrates these elements through a tripartite
data-human-model collaboration. First, we design an occlusion-grounded
self-supervised learning algorithm to harness the diversity of in-the-wild
image data, fine-tuning an inpainting diffusion model into a partial completion
diffusion model. Second, we establish a co-synthesis pipeline to iteratively
filter, refine, select, and annotate the initial deocclusion results of the
partial completion diffusion model, ensuring plausibility and fidelity through
human expert guidance and prior model constraints. This pipeline generates a
high-quality paired amodal dataset with extensive category and scale diversity,
comprising approximately 16K pairs. Finally, we train a full completion
diffusion model on the synthesized dataset, incorporating text prompts as
conditioning signals. Extensive experiments demonstrate the effectiveness of
our framework in achieving zero-shot generalization and textual
controllability. Our code, dataset, and models will be made publicly available
at https://github.com/imlixinyang/SynergyAmodal.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: The paper introduces SynergyAmodal, a framework for generating a high-quality image deocclusion dataset using a data-human-model collaboration approach, enabling zero-shot generalization and text-conditional deocclusion.</p>

            <!-- Chinese summary: lang is set so assistive technology switches pronunciation on this English page. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文介绍了 SynergyAmodal，一个利用数据-人类-模型协同方法生成高质量图像去遮挡数据集的框架，从而实现零样本泛化和文本条件去遮挡。</p>

            <!-- Star glyphs are decorative (aria-hidden); the "(n/10)" text carries the score. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the target page from reaching window.opener. -->
            <a href="https://arxiv.org/abs/2504.19506v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Xinyang Li, Chengjie Yi, Jiawei Lai, Mingbao Lin, Yansong Qu, Shengchuan Zhang, Liujuan Cao</p>

        </motion.div>
        
        <!-- Paper card: EarthMapper. Entrance animation settings live in the attributes below;
             viewport amount 0.2 means the animation triggers when 20% of the card is visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.35, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">EarthMapper: Visual Autoregressive Models for Controllable Bidirectional Satellite-Map Translation</h2>
            <p class="paper-summary">Satellite imagery and maps, as two fundamental data modalities in remote
sensing, offer direct observations of the Earth's surface and
human-interpretable geographic abstractions, respectively. The task of
bidirectional translation between satellite images and maps (BSMT) holds
significant potential for applications in urban planning and disaster response.
However, this task presents two major challenges: first, the absence of precise
pixel-wise alignment between the two modalities substantially complicates the
translation process; second, it requires achieving both high-level abstraction
of geographic features and high-quality visual synthesis, which further
elevates the technical complexity. To address these limitations, we introduce
EarthMapper, a novel autoregressive framework for controllable bidirectional
satellite-map translation. EarthMapper employs geographic coordinate embeddings
to anchor generation, ensuring region-specific adaptability, and leverages
multi-scale feature alignment within a geo-conditioned joint scale
autoregression (GJSA) process to unify bidirectional translation in a single
training cycle. A semantic infusion (SI) mechanism is introduced to enhance
feature-level consistency, while a key point adaptive guidance (KPAG) mechanism
is proposed to dynamically balance diversity and precision during inference. We
further contribute CNSatMap, a large-scale dataset comprising 302,132 precisely
aligned satellite-map pairs across 38 Chinese cities, enabling robust
benchmarking. Extensive experiments on CNSatMap and the New York dataset
demonstrate EarthMapper's superior performance, achieving significant
improvements in visual realism, semantic consistency, and structural fidelity
over state-of-the-art methods. Additionally, EarthMapper excels in zero-shot
tasks like in-painting, out-painting and coordinate-conditional generation,
underscoring its versatility.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: The paper introduces EarthMapper, an autoregressive framework for bidirectional satellite-map translation, addressing pixel misalignment and the need for high-level abstraction and visual synthesis. It includes a new large-scale dataset, CNSatMap, and demonstrates superior performance in various tasks.</p>

            <!-- Chinese summary: lang is set so assistive technology switches pronunciation on this English page. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文介绍了 EarthMapper，一个用于双向卫星地图转换的自回归框架，解决了像素未对齐以及对高级抽象和视觉合成的需求。它还包含一个新的大规模数据集 CNSatMap，并在各种任务中展示了卓越的性能。</p>

            <!-- Star glyphs are decorative (aria-hidden); the "(n/10)" text carries the score. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the target page from reaching window.opener. -->
            <a href="https://arxiv.org/abs/2504.19432v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Zhe Dong, Yuzhe Sun, Tianzhu Liu, Wangmeng Zuo, Yanfeng Gu</p>

        </motion.div>
        
        <!-- Paper card: CompleteMe. Entrance animation settings live in the attributes below;
             viewport amount 0.2 means the animation triggers when 20% of the card is visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.4, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">CompleteMe: Reference-based Human Image Completion</h2>
            <p class="paper-summary">Recent methods for human image completion can reconstruct plausible body
shapes but often fail to preserve unique details, such as specific clothing
patterns or distinctive accessories, without explicit reference images. Even
state-of-the-art reference-based inpainting approaches struggle to accurately
capture and integrate fine-grained details from reference images. To address
this limitation, we propose CompleteMe, a novel reference-based human image
completion framework. CompleteMe employs a dual U-Net architecture combined
with a Region-focused Attention (RFA) Block, which explicitly guides the
model's attention toward relevant regions in reference images. This approach
effectively captures fine details and ensures accurate semantic correspondence,
significantly improving the fidelity and consistency of completed images.
Additionally, we introduce a challenging benchmark specifically designed for
evaluating reference-based human image completion tasks. Extensive experiments
demonstrate that our proposed method achieves superior visual quality and
semantic consistency compared to existing techniques. Project page:
https://liagm.github.io/CompleteMe/</p>

            <p class="paper-tldr"><strong>TLDR</strong>: CompleteMe is a reference-based human image completion framework using a dual U-Net architecture and region-focused attention to improve fidelity and consistency by transferring fine-grained details from reference images. They also introduce a new benchmark for this task.</p>

            <!-- Chinese summary: lang is set so assistive technology switches pronunciation on this English page. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: CompleteMe是一个基于参考的人体图像补全框架，它使用双U-Net架构和区域聚焦注意力机制，通过从参考图像中传输细粒度的细节来提高保真度和一致性。他们还为此任务引入了一个新的基准。</p>

            <!-- Star glyphs are decorative (aria-hidden); the "(n/10)" text carries the score. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the target page from reaching window.opener. -->
            <a href="https://arxiv.org/abs/2504.20042v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Yu-Ju Tsai, Brian Price, Qing Liu, Luis Figueroa, Daniil Pakhomov, Zhihong Ding, Scott Cohen, Ming-Hsuan Yang</p>

        </motion.div>
        
        <!-- Paper card: Brenier potentials with convex GANs. Entrance animation settings live in the
             attributes below; viewport amount 0.2 means the animation triggers when 20% of the card is visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.45, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Learning Brenier Potentials with Convex Generative Adversarial Neural Networks</h2>
            <p class="paper-summary">Brenier proved that under certain conditions on a source and a target
probability measure there exists a strictly convex function such that its
gradient is a transport map from the source to the target distribution. This
function is called the Brenier potential. Furthermore, detailed information on
the Hölder regularity of the Brenier potential is available. In this work we
develop the statistical learning theory of generative adversarial neural
networks that learn the Brenier potential. As by the transformation of
densities formula, the density of the generated measure depends on the second
derivative of the Brenier potential, we develop the universal approximation
theory of ReCU networks with cubic activation $\mathtt{ReCU}(x)=\max\{0,x\}^3$
that combines the favorable approximation properties of Hölder functions with
a Lipschitz continuous density. In order to assure the convexity of such
general networks, we introduce an adversarial training procedure for a
potential function represented by the ReCU networks that combines the classical
discriminator cross entropy loss with a penalty term that enforces (strict)
convexity. We give a detailed decomposition of learning errors and show that
for a suitable high penalty parameter all networks chosen in the adversarial
min-max optimization problem are strictly convex. This is further exploited to
prove the consistency of the learning procedure for (slowly) expanding network
capacity. We also implement the described learning algorithm and apply it to a
number of standard test cases from Gaussian mixture to image data as target
distributions. As predicted in theory, we observe that the convexity loss
becomes inactive during the training process and the potentials represented by
the neural networks have learned convexity.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: This paper presents a novel generative adversarial network (GAN) training procedure using ReCU networks and adversarial training to learn Brenier potentials, ensuring convexity for improved generative modeling. The method is validated on Gaussian mixture and image data.</p>

            <!-- Chinese summary: lang is set so assistive technology switches pronunciation on this English page. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 本文提出了一种新颖的生成对抗网络（GAN）训练方法，使用ReCU网络和对抗训练来学习Brenier势，从而确保凸性以改进生成模型。该方法在Gaussian混合模型和图像数据上得到了验证。</p>

            <!-- Star glyphs are decorative (aria-hidden); the "(n/10)" text carries the score. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the target page from reaching window.opener. -->
            <a href="https://arxiv.org/abs/2504.19779v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Claudia Drygala, Hanno Gottschalk, Thomas Kruse, Ségolène Martin, Annika Mütze</p>

        </motion.div>
        
        <!-- Paper card: CE-NPBG (arXiv 2504.19557). viewport amount 0.2: trigger when 20% is visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.5, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">CE-NPBG: Connectivity Enhanced Neural Point-Based Graphics for Novel View Synthesis in Autonomous Driving Scenes</h2>
            <p class="paper-summary">Current point-based approaches encounter limitations in scalability and
rendering quality when using large 3D point cloud maps because using them
directly for novel view synthesis (NVS) leads to degraded visualizations. We
identify the primary issue behind these low-quality renderings as a visibility
mismatch between geometry and appearance, stemming from using these two
modalities together. To address this problem, we present CE-NPBG, a new
approach for novel view synthesis (NVS) in large-scale autonomous driving
scenes. Our method is a neural point-based technique that leverages two
modalities: posed images (cameras) and synchronized raw 3D point clouds
(LiDAR). We first employ a connectivity relationship graph between appearance
and geometry, which retrieves points from a large 3D point cloud map observed
from the current camera perspective and uses them for rendering. By leveraging
this connectivity, our method significantly improves rendering quality and
enhances run-time and scalability by using only a small subset of points from
the large 3D point cloud map. Our approach associates neural descriptors with
the points and uses them to synthesize views. To enhance the encoding of these
descriptors and elevate rendering quality, we propose a joint adversarial and
point rasterization training. During training, we pair an image-synthesizer
network with a multi-resolution discriminator. At inference, we decouple them
and use the image-synthesizer to generate novel views. We also integrate our
proposal into the recent 3D Gaussian Splatting work to highlight its benefits
for improved rendering and scalability.</p>

            <!-- English TLDR -->
            <p class="paper-tldr"><strong>TLDR</strong>: This paper introduces CE-NPBG, a novel view synthesis method for autonomous driving scenes that enhances rendering quality and scalability by leveraging connectivity between posed images and LiDAR point clouds using a neural point-based technique.</p>

            <!-- Chinese TLDR; lang is set because the page default is lang="en" -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 本文介绍了一种名为 CE-NPBG 的新视角合成方法，用于自动驾驶场景。该方法利用神经点云技术，通过连接姿态图像和激光雷达点云之间的关系，提高了渲染质量和可扩展性。</p>

            <!-- Per-criterion scores: star icons are decorative (aria-hidden); the "(n/10)" text carries the value -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>

            <!-- Overall score -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the target page from reaching window.opener -->
            <a href="https://arxiv.org/abs/2504.19557v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Mohammad Altillawi, Fengyi Shen, Liudi Yang, Sai Manoj Prakhya, Ziyuan Liu</p>

        </motion.div>
        
        <!-- Paper card: Masked Language Prompting (arXiv 2504.19455). viewport amount 0.2: trigger when 20% is visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.55, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Masked Language Prompting for Generative Data Augmentation in Few-shot Fashion Style Recognition</h2>
            <!-- The abstract's LaTeX \textbf{...} is rendered with <strong> instead of leaking raw markup -->
            <p class="paper-summary">Constructing dataset for fashion style recognition is challenging due to the
inherent subjectivity and ambiguity of style concepts. Recent advances in
text-to-image models have facilitated generative data augmentation by
synthesizing images from labeled data, yet existing methods based solely on
class names or reference captions often fail to balance visual diversity and
style consistency. In this work, we propose <strong>Masked Language Prompting
(MLP)</strong>, a novel prompting strategy that masks selected words in a reference
caption and leverages large language models to generate diverse yet
semantically coherent completions. This approach preserves the structural
semantics of the original caption while introducing attribute-level variations
aligned with the intended style, enabling style-consistent and diverse image
generation without fine-tuning. Experimental results on the FashionStyle14
dataset demonstrate that our MLP-based augmentation consistently outperforms
class-name and caption-based baselines, validating its effectiveness for
fashion style recognition under limited supervision.</p>

            <!-- English TLDR -->
            <p class="paper-tldr"><strong>TLDR</strong>: The paper introduces Masked Language Prompting (MLP) for generative data augmentation in few-shot fashion style recognition. MLP leverages large language models to generate diverse and style-consistent images from masked reference captions, outperforming existing augmentation methods.</p>

            <!-- Chinese TLDR; lang is set because the page default is lang="en" -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文介绍了用于少样本时尚风格识别的生成数据增强的掩码语言提示（MLP）。 MLP利用大型语言模型从掩码参考字幕中生成多样且风格一致的图像，优于现有的增强方法。</p>

            <!-- Per-criterion scores: star icons are decorative (aria-hidden); the "(n/10)" text carries the value -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
            </div>

            <!-- Overall score -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the target page from reaching window.opener -->
            <a href="https://arxiv.org/abs/2504.19455v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Yuki Hirakawa, Ryotaro Shimizu</p>

        </motion.div>
        
        <!-- Paper card: UNet with Axial Transformer (arXiv 2504.19408). viewport amount 0.2: trigger when 20% is visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.6, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">UNet with Axial Transformer : A Neural Weather Model for Precipitation Nowcasting</h2>
            <p class="paper-summary">Making accurate weather predictions can be particularly challenging for
localized storms or events that evolve on hourly timescales, such as
thunderstorms. Hence, our goal for the project was to model Weather Nowcasting
for making highly localized and accurate predictions that apply to the
immediate future replacing the current numerical weather models and data
assimilation systems with Deep Learning approaches. A significant advantage of
machine learning is that inference is computationally cheap given an
already-trained model, allowing forecasts that are nearly instantaneous and in
the native high resolution of the input data. In this work we developed a novel
method that employs Transformer-based machine learning models to forecast
precipitation. This approach works by leveraging axial attention mechanisms to
learn complex patterns and dynamics from time series frames. Moreover, it is a
generic framework and can be applied to univariate and multivariate time series
data, as well as time series embeddings data. This paper represents an initial
research on the dataset used in the domain of next frame prediction, and hence,
we demonstrate state-of-the-art results in terms of metrics (PSNR = 47.67,
SSIM = 0.9943) used for the given dataset using UNet with Axial Transformer.</p>

            <!-- English TLDR -->
            <p class="paper-tldr"><strong>TLDR</strong>: This paper introduces a UNet with Axial Transformer model for precipitation nowcasting, aiming to replace traditional numerical weather models with deep learning for highly localized and accurate predictions, achieving state-of-the-art results on a specific dataset.</p>

            <!-- Chinese TLDR; lang is set because the page default is lang="en" -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文介绍了一种用于降水临近预报的带有轴向Transformer的UNet模型，旨在用深度学习取代传统的数值天气模型，以实现高度局部化和精确的预测，并在特定数据集上取得了最先进的结果。</p>

            <!-- Per-criterion scores: star icons are decorative (aria-hidden); the "(n/10)" text carries the value -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>

            <!-- Overall score -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the target page from reaching window.opener -->
            <a href="https://arxiv.org/abs/2504.19408v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Maitreya Sonawane, Sumit Mamtani</p>

        </motion.div>
        
        <!-- Paper card: Boosting 3D Liver Shape Datasets (arXiv 2504.19402). viewport amount 0.2: trigger when 20% is visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.65, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Boosting 3D Liver Shape Datasets with Diffusion Models and Implicit Neural Representations</h2>
            <p class="paper-summary">While the availability of open 3D medical shape datasets is increasing,
offering substantial benefits to the research community, we have found that
many of these datasets are, unfortunately, disorganized and contain artifacts.
These issues limit the development and training of robust models, particularly
for accurate 3D reconstruction tasks. In this paper, we examine the current
state of available 3D liver shape datasets and propose a solution using
diffusion models combined with implicit neural representations (INRs) to
augment and expand existing datasets. Our approach utilizes the generative
capabilities of diffusion models to create realistic, diverse 3D liver shapes,
capturing a wide range of anatomical variations and addressing the problem of
data scarcity. Experimental results indicate that our method enhances dataset
diversity, providing a scalable solution to improve the accuracy and
reliability of 3D liver reconstruction and generation in medical applications.
Finally, we suggest that diffusion models can also be applied to other
downstream tasks in 3D medical imaging.</p>

            <!-- English TLDR -->
            <p class="paper-tldr"><strong>TLDR</strong>: This paper proposes using diffusion models and implicit neural representations to augment 3D liver shape datasets, addressing issues of data scarcity and dataset quality to improve 3D liver reconstruction.</p>

            <!-- Chinese TLDR; lang is set because the page default is lang="en" -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文提出了一种使用扩散模型和隐式神经表示来扩充3D肝脏形状数据集的方法，解决了数据稀缺和数据集质量问题，从而提高3D肝脏重建的准确性。</p>

            <!-- Per-criterion scores: star icons are decorative (aria-hidden); the "(n/10)" text carries the value -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>

            <!-- Overall score -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the target page from reaching window.opener -->
            <a href="https://arxiv.org/abs/2504.19402v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Khoa Tuan Nguyen, Francesca Tozzi, Wouter Willaert, Joris Vankerschaver, Nikdokht Rashidian, Wesley De Neve</p>

        </motion.div>
        
        <!-- Paper card: HumMorph (arXiv 2504.19390). viewport amount 0.2: trigger when 20% is visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.7, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">HumMorph: Generalized Dynamic Human Neural Fields from Few Views</h2>
            <p class="paper-summary">We introduce HumMorph, a novel generalized approach to free-viewpoint
rendering of dynamic human bodies with explicit pose control. HumMorph renders
a human actor in any specified pose given a few observed views (starting from
just one) in arbitrary poses. Our method enables fast inference as it relies
only on feed-forward passes through the model. We first construct a coarse
representation of the actor in the canonical T-pose, which combines visual
features from individual partial observations and fills missing information
using learned prior knowledge. The coarse representation is complemented by
fine-grained pixel-aligned features extracted directly from the observed views,
which provide high-resolution appearance information. We show that HumMorph is
competitive with the state-of-the-art when only a single input view is
available, however, we achieve results with significantly better visual quality
given just 2 monocular observations. Moreover, previous generalized methods
assume access to accurate body shape and pose parameters obtained using
synchronized multi-camera setups. In contrast, we consider a more practical
scenario where these body parameters are noisily estimated directly from the
observed views. Our experimental results demonstrate that our architecture is
more robust to errors in the noisy parameters and clearly outperforms the state
of the art in this setting.</p>

            <!-- English TLDR -->
            <p class="paper-tldr"><strong>TLDR</strong>: HumMorph introduces a novel neural field approach for rendering dynamic human bodies from few views with explicit pose control, demonstrating robustness to noisy pose parameters and achieving state-of-the-art performance with limited input views.</p>

            <!-- Chinese TLDR; lang is set because the page default is lang="en" -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: HumMorph 提出了一种新颖的神经场方法，用于从少量视图渲染具有显式姿势控制的动态人体，展示了对噪声姿势参数的鲁棒性，并在有限的输入视图下实现了最先进的性能。</p>

            <!-- Per-criterion scores: star icons are decorative (aria-hidden); the "(n/10)" text carries the value -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>

            <!-- Overall score -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the target page from reaching window.opener -->
            <a href="https://arxiv.org/abs/2504.19390v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Jakub Zadrożny, Hakan Bilen</p>

        </motion.div>
        
        <!-- Paper card: Mitigating Catastrophic Forgetting (arXiv 2504.20033). viewport amount 0.2: trigger when 20% is visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.75, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Mitigating Catastrophic Forgetting in the Incremental Learning of Medical Images</h2>
            <p class="paper-summary">This paper proposes an Incremental Learning (IL) approach to enhance the
accuracy and efficiency of deep learning models in analyzing T2-weighted (T2w)
MRI medical images prostate cancer detection using the PI-CAI dataset. We used
multiple health centers' artificial intelligence and radiology data, focused on
different tasks that looked at prostate cancer detection using MRI (PI-CAI). We
utilized Knowledge Distillation (KD), as it employs generated images from past
tasks to guide the training of models for subsequent tasks. The approach
yielded improved performance and faster convergence of the models. To
demonstrate the versatility and robustness of our approach, we evaluated it on
the PI-CAI dataset, a diverse set of medical imaging modalities including OCT
and PathMNIST, and the benchmark continual learning dataset CIFAR-10. Our
results indicate that KD can be a promising technique for IL in medical image
analysis in which data is sourced from individual health centers and the
storage of large datasets is not feasible. By using generated images from prior
tasks, our method enables the model to retain and apply previously acquired
knowledge without direct access to the original data.</p>

            <!-- English TLDR -->
            <p class="paper-tldr"><strong>TLDR</strong>: The paper introduces an incremental learning (IL) approach using knowledge distillation (KD) with generated images to mitigate catastrophic forgetting in medical image analysis, particularly for prostate cancer detection in MRI. It demonstrates the method's versatility on PI-CAI, OCT, PathMNIST, and CIFAR-10 datasets.</p>

            <!-- Chinese TLDR; lang is set because the page default is lang="en" -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文提出了一种增量学习（IL）方法，使用知识蒸馏（KD）与生成的图像，以减轻医学图像分析中的灾难性遗忘，特别是前列腺癌的MRI检测。 它在PI-CAI，OCT，PathMNIST和CIFAR-10数据集上展示了该方法的多功能性。</p>

            <!-- Per-criterion scores: star icons are decorative (aria-hidden); the "(n/10)" text carries the value -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>

            <!-- Overall score -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the target page from reaching window.opener -->
            <a href="https://arxiv.org/abs/2504.20033v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Sara Yavari, Jacob Furst</p>

        </motion.div>
        
        <!-- Paper card: CoDEx. viewport amount 0.2 is meant to trigger the reveal when 20% of the card is visible. -->
        <motion.div
            class="bento-item"
            data-motion-element
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.8, ease: 'easeOut' }"
        >
            <h2 class="paper-title">CoDEx: Combining Domain Expertise for Spatial Generalization in Satellite Image Analysis</h2>
            <p class="paper-summary">Global variations in terrain appearance raise a major challenge for satellite
image analysis, leading to poor model performance when training on locations
that differ from those encountered at test time. This remains true even with
recent large global datasets. To address this challenge, we propose a novel
domain-generalization framework for satellite images. Instead of trying to
learn a single generalizable model, we train one expert model per training
domain, while learning experts' similarity and encouraging similar experts to
be consistent. A model selection module then identifies the most suitable
experts for a given test sample and aggregates their predictions. Experiments
on four datasets (DynamicEarthNet, MUDS, OSCD, and FMoW) demonstrate consistent
gains over existing domain generalization and adaptation methods. Our code is
publicly available at https://github.com/Abhishek19009/CoDEx.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces codex, a domain-generalization framework that trains separate expert models for each training domain in satellite imagery and then selects and aggregates predictions from the most suitable experts for new test samples, demonstrating improved performance across multiple datasets.</p>

            <!-- Chinese translation of the TLDR; lang lets assistive tech and fonts switch language. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文提出了codex，一个领域泛化框架，它为卫星图像中的每个训练领域训练单独的专家模型，然后选择并聚合来自最适合新测试样本的专家的预测，并在多个数据集上展示了改进的性能。</p>

            <!-- Sub-ratings: star icons are decorative; the "(n/10)" text carries the value. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>

            <!-- Opens in a new tab, so rel="noopener noreferrer" keeps the opened page from reaching window.opener. -->
            <a href="https://arxiv.org/abs/2504.19737v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Abhishek Kuriyal, Elliot Vincent, Mathieu Aubry, Loic Landrieu</p>

        </motion.div>
        
        <!-- Paper card: task specialization via domain constraining. viewport amount 0.2 is meant to trigger the reveal when 20% of the card is visible. -->
        <motion.div
            class="bento-item"
            data-motion-element
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.8500000000000001, ease: 'easeOut' }"
        >
            <h2 class="paper-title">Neural network task specialization via domain constraining</h2>
            <p class="paper-summary">This paper introduces a concept of neural network specialization via
task-specific domain constraining, aimed at enhancing network performance on
data subspace in which the network operates. The study presents experiments on
training specialists for image classification and object detection tasks. The
results demonstrate that specialization can enhance a generalist's accuracy
even without additional data or changing training regimes: solely by
constraining class label space in which the network performs. Theoretical and
experimental analyses indicate that effective specialization requires modifying
traditional fine-tuning methods and constraining data space to semantically
coherent subsets. The specialist extraction phase before tuning the network is
proposed for maximal performance gains. We also provide analysis of the
evolution of the feature space during specialization. This study paves way to
future research for developing more advanced dynamically configurable image
analysis systems, where computations depend on the specific input.
Additionally, the proposed methods can help improve system performance in
scenarios where certain data domains should be excluded from consideration of
the generalist network.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: the paper explores neural network specialization by constraining the class label space during fine-tuning, leading to improved performance in image classification and object detection, even without new data. they propose a specialist extraction phase for performance gains.</p>

            <!-- Chinese translation of the TLDR; lang lets assistive tech and fonts switch language. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文通过在微调期间约束类别标签空间来探索神经网络的专业化，从而提高图像分类和对象检测的性能，即使没有新数据。他们提出一个专家提取阶段以获得性能提升。</p>

            <!-- Sub-ratings: star icons are decorative; the "(n/10)" text carries the value. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(4/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(5/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>

            <!-- Opens in a new tab, so rel="noopener noreferrer" keeps the opened page from reaching window.opener. -->
            <a href="https://arxiv.org/abs/2504.19592v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Roman Malashin, Daniil Ilyukhin</p>

        </motion.div>
        
        <!-- Paper card: DEEMO. viewport amount 0.2 is meant to trigger the reveal when 20% of the card is visible. -->
        <motion.div
            class="bento-item"
            data-motion-element
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.9, ease: 'easeOut' }"
        >
            <h2 class="paper-title">DEEMO: De-identity Multimodal Emotion Recognition and Reasoning</h2>
            <p class="paper-summary">Emotion understanding is a critical yet challenging task. Most existing
approaches rely heavily on identity-sensitive information, such as facial
expressions and speech, which raises concerns about personal privacy. To
address this, we introduce the De-identity Multimodal Emotion Recognition and
Reasoning (DEEMO), a novel task designed to enable emotion understanding using
de-identified video and audio inputs. The DEEMO dataset consists of two
subsets: DEEMO-NFBL, which includes rich annotations of Non-Facial Body
Language (NFBL), and DEEMO-MER, an instruction dataset for Multimodal Emotion
Recognition and Reasoning using identity-free cues. This design supports
emotion understanding without compromising identity privacy. In addition, we
propose DEEMO-LLaMA, a Multimodal Large Language Model (MLLM) that integrates
de-identified audio, video, and textual information to enhance both emotion
recognition and reasoning. Extensive experiments show that DEEMO-LLaMA achieves
state-of-the-art performance on both tasks, outperforming existing MLLMs by a
significant margin, achieving 74.49% accuracy and 74.45% F1-score in
de-identity emotion recognition, and 6.20 clue overlap and 7.66 label overlap
in de-identity emotion reasoning. Our work contributes to ethical AI by
advancing privacy-preserving emotion understanding and promoting responsible
affective computing.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces deemo, a new task and dataset for de-identified multimodal emotion recognition and reasoning, along with deemo-llama, an mllm that outperforms existing models on this task, contributing to privacy-preserving affective computing.</p>

            <!-- Chinese translation of the TLDR; lang lets assistive tech and fonts switch language. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文介绍了deemo，一个用于去身份化多模态情感识别和推理的新任务和数据集，以及deemo-llama，一个在此任务上优于现有模型的mllm，为保护隐私的情感计算做出了贡献。</p>

            <!-- Sub-ratings: star icons are decorative; the "(n/10)" text carries the value. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>

            <!-- Opens in a new tab, so rel="noopener noreferrer" keeps the opened page from reaching window.opener. -->
            <a href="https://arxiv.org/abs/2504.19549v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Deng Li, Bohao Xing, Xin Liu, Baiqiang Xia, Bihan Wen, Heikki Kälviäinen</p>

        </motion.div>
        
        <!-- Paper card: Adversarial Shallow Watermarking. viewport amount 0.2 is meant to trigger the reveal when 20% of the card is visible. -->
        <motion.div
            class="bento-item"
            data-motion-element
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.9500000000000001, ease: 'easeOut' }"
        >
            <h2 class="paper-title">Adversarial Shallow Watermarking</h2>
            <p class="paper-summary">Recent advances in digital watermarking make use of deep neural networks for
message embedding and extraction. They typically follow the ``encoder-noise
layer-decoder''-based architecture. By deliberately establishing a
differentiable noise layer to simulate the distortion of the watermarked
signal, they jointly train the deep encoder and decoder to fit the noise layer
to guarantee robustness. As a result, they are usually weak against unknown
distortions that are not used in their training pipeline. In this paper, we
propose a novel watermarking framework to resist unknown distortions, namely
Adversarial Shallow Watermarking (ASW). ASW utilizes only a shallow decoder
that is randomly parameterized and designed to be insensitive to distortions
for watermarking extraction. During the watermark embedding, ASW freezes the
shallow decoder and adversarially optimizes a host image until its updated
version (i.e., the watermarked image) stably triggers the shallow decoder to
output the watermark message. During the watermark extraction, it accurately
recovers the message from the watermarked image by leveraging the insensitive
nature of the shallow decoder against arbitrary distortions. Our ASW is
training-free, encoder-free, and noise layer-free. Experiments indicate that
the watermarked images created by ASW have strong robustness against various
unknown distortions. Compared to the existing ``encoder-noise layer-decoder''
approaches, ASW achieves comparable results on known distortions and better
robustness on unknown distortions.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces adversarial shallow watermarking (asw), a training-free watermarking method that uses an adversarially optimized host image and a shallow decoder to achieve robustness against unknown distortions, outperforming existing methods in this aspect.</p>

            <!-- Chinese translation of the TLDR; lang lets assistive tech and fonts switch language. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文介绍了一种名为对抗性浅层水印（asw）的免训练水印方法，该方法使用对抗性优化的宿主图像和一个浅层解码器，以实现对未知失真的鲁棒性，并且在这方面优于现有方法。</p>

            <!-- Sub-ratings: star icons are decorative; the "(n/10)" text carries the value. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>

            <!-- Opens in a new tab, so rel="noopener noreferrer" keeps the opened page from reaching window.opener. -->
            <a href="https://arxiv.org/abs/2504.19529v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Guobiao Li, Lei Tan, Yuliang Xue, Gaozhi Liu, Zhenxing Qian, Sheng Li, Xinpeng Zhang</p>

        </motion.div>
        
        <!-- Paper card: OVQE for VVC compressed videos. viewport amount 0.2 is meant to trigger the reveal when 20% of the card is visible. -->
        <motion.div
            class="bento-item"
            data-motion-element
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 1.0, ease: 'easeOut' }"
        >
            <h2 class="paper-title">Enhancing Quality for VVC Compressed Videos with Omniscient Quality Enhancement Model</h2>
            <p class="paper-summary">The latest video coding standard H.266/VVC has shown its great improvement in
terms of compression performance when compared to its predecessor HEVC
standard. Though VVC was implemented with many advanced techniques, it still
met the same challenges as its predecessor due to the need for even higher
perceptual quality demand at the decoder side as well as the compression
performance at the encoder side. The advancement of Artificial Intelligence
(AI) technology, notably the deep learning-based video quality enhancement
methods, was shown to be a promising approach to improving the perceptual
quality experience. In this paper, we propose a novel Omniscient video quality
enhancement Network for VVC compressed Videos. The Omniscient Network for
compressed video quality enhancement was originally designed for HEVC
compressed videos in which not only the spatial-temporal features but also
cross-frequencies information were employed to augment the visual quality.
Inspired by this work, we propose a modification of the OVQE model and
integrate it into the lasted STD-VVC (Standard Versatile Video Coding) decoder
architecture. As assessed in a rich set of test conditions, the proposed
OVQE-VVC solution is able to achieve significant PSNR improvement, notably
around 0.74 dB and up to 1.2 dB with respect to the original STD-VVC codec.
This also corresponds to around 19.6% of bitrate saving while keeping a similar
quality observation.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: this paper proposes a modified omniscient video quality enhancement (ovqe) model integrated into the vvc decoder to improve the perceptual quality of compressed videos, achieving significant psnr improvements and bitrate savings.</p>

            <!-- Chinese translation of the TLDR; lang lets assistive tech and fonts switch language. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 本文提出了一种改进的全知视频质量增强（ovqe）模型，并将其集成到vvc解码器中，以提高压缩视频的感知质量，从而显著提高psnr并节省比特率。</p>

            <!-- Sub-ratings: star icons are decorative; the "(n/10)" text carries the value. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(2/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(5/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(4/10)</span>
            </div>

            <!-- Opens in a new tab, so rel="noopener noreferrer" keeps the opened page from reaching window.opener. -->
            <a href="https://arxiv.org/abs/2504.19935v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Xiem HoangVan, Hieu Bui Minh, Sang NguyenQuang, Wen-Hsiao Peng</p>

        </motion.div>
        
        <!-- Paper card: FedCAug. viewport amount 0.2 is meant to trigger the reveal when 20% of the card is visible. -->
        <motion.div
            class="bento-item"
            data-motion-element
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 1.05, ease: 'easeOut' }"
        >
            <h2 class="paper-title">Federated Out-of-Distribution Generalization: A Causal Augmentation View</h2>
            <p class="paper-summary">Federated learning aims to collaboratively model by integrating multi-source
information to obtain a model that can generalize across all client data.
Existing methods often leverage knowledge distillation or data augmentation to
mitigate the negative impact of data bias across clients. However, the limited
performance of teacher models on out-of-distribution samples and the inherent
quality gap between augmented and original data hinder their effectiveness and
they typically fail to leverage the advantages of incorporating rich contextual
information. To address these limitations, this paper proposes a Federated
Causal Augmentation method, termed FedCAug, which employs causality-inspired
data augmentation to break the spurious correlation between attributes and
categories. Specifically, it designs a causal region localization module to
accurately identify and decouple the background and objects in the image,
providing rich contextual information for causal data augmentation.
Additionally, it designs a causality-inspired data augmentation module that
integrates causal features and within-client context to generate counterfactual
samples. This significantly enhances data diversity, and the entire process
does not require any information sharing between clients, thereby contributing
to the protection of data privacy. Extensive experiments conducted on three
datasets reveal that FedCAug markedly reduces the model's reliance on
background to predict sample labels, achieving superior performance compared to
state-of-the-art methods.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces fedcaug, a federated learning method that uses causality-inspired data augmentation to improve out-of-distribution generalization by breaking spurious correlations between attributes and categories without inter-client information sharing.</p>

            <!-- Chinese translation of the TLDR; lang lets assistive tech and fonts switch language. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文介绍了fedcaug，一种联邦学习方法，它使用因果关系驱动的数据增强来提高分布外泛化能力，通过打破属性和类别之间的虚假相关性，且无需客户端间的信息共享。</p>

            <!-- Sub-ratings: star icons are decorative; the "(n/10)" text carries the value. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(2/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(4/10)</span>
            </div>

            <!-- Opens in a new tab, so rel="noopener noreferrer" keeps the opened page from reaching window.opener. -->
            <a href="https://arxiv.org/abs/2504.19882v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Runhui Zhang, Sijin Zhou, Zhuang Qi</p>

        </motion.div>
        
        <!--
            Paper card: NSegment.
            viewport amount 0.2 = trigger the reveal when 20% of the card is visible.
            Keep notes like this outside the start tag: text placed inside a tag
            is parsed as attributes, not as a comment.
            NOTE(review): the animation attributes are presumably consumed by a page
            script that selects [data-motion-element]; confirm before renaming them.
        -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 1.1, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">NSegment : Noisy Segment Improves Remote Sensing Image Segmentation</h2>
            <p class="paper-summary">Labeling errors in remote sensing (RS) image segmentation datasets often
remain implicit and subtle due to ambiguous class boundaries, mixed pixels,
shadows, complex terrain features, and subjective annotator bias. Furthermore,
the scarcity of annotated RS data due to high image acquisition and labeling
costs complicates training noise-robust models. While sophisticated mechanisms
such as label selection or noise correction might address this issue, they tend
to increase training time and add implementation complexity. In this letter, we
propose NSegment-a simple yet effective data augmentation solution to mitigate
this issue. Unlike traditional methods, it applies elastic transformations only
to segmentation labels, varying deformation intensity per sample in each
training epoch to address annotation inconsistencies. Experimental results
demonstrate that our approach improves the performance of RS image segmentation
on various state-of-the-art models.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces nsegment, a data augmentation technique that applies elastic transformations to noisy remote sensing image segmentation labels, improving model performance without increasing training complexity.</p>

            <!-- Chinese translation of the TLDR; lang lets screen readers switch pronunciation. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 本文介绍了一种名为nsegment的数据增强技术，该技术对有噪声的遥感图像分割标签应用弹性变换，从而在不增加训练复杂性的前提下提高模型性能。</p>

            <!-- Per-criterion ratings. Star icons are decorative (aria-hidden); the "(n/10)" text carries the value. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(2/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(5/10)</span>
                </div>
            </div>

            <!-- Overall rating. Stars stay on separate lines so the inter-icon whitespace is preserved. -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(4/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the target page from reaching window.opener. -->
            <a href="https://arxiv.org/abs/2504.19634v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Yechan Kim, DongHo Yoon, SooYeon Kim, Moongu Jeon</p>
        </motion.div>
        
        <!--
            Paper card: Lightweight Adapter Learning (CANet).
            viewport amount 0.2 = trigger the reveal when 20% of the card is visible.
            Keep notes like this outside the start tag: text placed inside a tag
            is parsed as attributes, not as a comment.
            NOTE(review): the animation attributes are presumably consumed by a page
            script that selects [data-motion-element]; confirm before renaming them.
        -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 1.1500000000000001, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Lightweight Adapter Learning for More Generalized Remote Sensing Change Detection</h2>
            <p class="paper-summary">Deep learning methods have shown promising performances in remote sensing
image change detection (CD). However, existing methods usually train a
dataset-specific deep network for each dataset. Due to the significant
differences in the data distribution and labeling between various datasets, the
trained dataset-specific deep network has poor generalization performances on
other datasets. To solve this problem, this paper proposes a change adapter
network (CANet) for a more universal and generalized CD. CANet contains
dataset-shared and dataset-specific learning modules. The former explores the
discriminative features of images, and the latter designs a lightweight adapter
model, to deal with the characteristics of different datasets in data
distribution and labeling. The lightweight adapter can quickly generalize the
deep network for new CD tasks with a small computation cost. Specifically, this
paper proposes an interesting change region mask (ICM) in the adapter, which
can adaptively focus on interested change objects and decrease the influence of
labeling differences in various datasets. Moreover, CANet adopts a unique batch
normalization layer for each dataset to deal with data distribution
differences. Compared with existing deep learning methods, CANet can achieve
satisfactory CD performances on various datasets simultaneously. Experimental
results on several public datasets have verified the effectiveness and
advantages of the proposed CANet on CD. CANet has a stronger generalization
ability, smaller training costs (merely updating 4.1%-7.7% parameters), and
better performances under limited training datasets than other deep learning
methods, which also can be flexibly inserted with existing deep models.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces a change adapter network (canet) for remote sensing change detection that uses lightweight adapters and dataset-specific normalization to improve generalization across different datasets while minimizing computational costs.</p>

            <!-- Chinese translation of the TLDR; lang lets screen readers switch pronunciation. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文介绍了一种用于遥感变化检测的变化适配器网络（canet），它使用轻量级适配器和数据集特定的归一化来提高不同数据集之间的泛化能力，同时最大限度地降低计算成本。</p>

            <!-- Per-criterion ratings. Star icons are decorative (aria-hidden); the "(n/10)" text carries the value. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(2/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
            </div>

            <!-- Overall rating. Stars stay on separate lines so the inter-icon whitespace is preserved. -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(4/10)</span>
            </div>

            <!-- Opens in a new tab; rel prevents the target page from reaching window.opener. -->
            <a href="https://arxiv.org/abs/2504.19598v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Dou Quan, Rufan Zhou, Shuang Wang, Ning Huyan, Dong Zhao, Yunan Li, Licheng Jiao</p>
        </motion.div>
        
    </div>

    <!-- Page footer: generation timestamp (machine-readable via <time>) and attribution link. -->
    <footer class="footer">
        Generated on <time datetime="2025-05-01T04:31:04Z">2025-05-01 04:31:04 UTC</time>. Powered by <a href="https://github.com/onion-liu" target="_blank" rel="noopener noreferrer">onion-liu</a>.
    </footer>

</body>
</html>